Replace tsc_native config option with tsc_mode config option
authorKeir Fraser <keir.fraser@citrix.com>
Wed, 25 Nov 2009 14:05:28 +0000 (14:05 +0000)
committerKeir Fraser <keir.fraser@citrix.com>
Wed, 25 Nov 2009 14:05:28 +0000 (14:05 +0000)
(NOTE: pvrdtscp mode not finished yet, but all other
modes have been tested so sooner seemed better than
later to submit this fairly major patch so we can get
more mileage on it before next release.)

New tsc_mode config option supersedes tsc_native and
offers a more intelligent default and an additional
option for intelligent apps running on PV domains
("pvrdtscp").

For PV domains, default mode will determine if the initial
host has a "safe"** TSC (meaning it is always synchronized
across all physical CPUs).  If so, all domains will
execute all rdtsc instructions natively; if not,
all domains will emulate all rdtsc instructions, while
providing the TSC hertz rate of the initial machine.
After being restored or live-migrated, all PV domains will
emulate all rdtsc instructions.  Hence, this default mode
guarantees correctness while providing native performance
in most conditions.

For PV domains, tsc_mode==1 will always emulate rdtsc
and tsc_mode==2 will never emulate rdtsc.  For tsc_mode==3,
rdtsc will never be emulated, but information is provided
through pvcpuid instructions and rdtscp instructions
so that an app can obtain "safe" pvclock-like TSC information
across save/restore and live migration. (Will be completed in
a follow-on patch.)

For HVM domains, the default mode and "always emulate"
mode do the same as tsc_native==0; the other two modes
do the same as tsc_native==1.  (HVM domains since 3.4
have implemented a tsc_mode=default-like functionality,
but also can preserve native TSC across save/restore
and live-migration IFF the initial and target machines
have a common TSC cycle rate.)

** All newer AMD machines, and Nehalem and future Intel
machines have "Invariant TSC"; many newer Intel machines
have "Constant TSC" and do not support deep-C sleep states;
these and all single-processor machines are "safe".

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>
20 files changed:
tools/examples/xmexample.hvm
tools/libxc/xc_domain.c
tools/libxc/xc_domain_restore.c
tools/libxc/xc_domain_save.c
tools/libxc/xenctrl.h
tools/python/xen/lowlevel/xc/xc.c
tools/python/xen/xend/XendConfig.py
tools/python/xen/xend/XendDomainInfo.py
tools/python/xen/xm/create.py
tools/python/xen/xm/xenapi_create.py
xen/arch/x86/domain.c
xen/arch/x86/domctl.c
xen/arch/x86/hvm/hvm.c
xen/arch/x86/time.c
xen/arch/x86/traps.c
xen/include/asm-x86/domain.h
xen/include/asm-x86/msr.h
xen/include/asm-x86/processor.h
xen/include/asm-x86/time.h
xen/include/public/domctl.h

index e5ae97f966aa8648b8164ecf880101485a85d180..09edda6aacb317dcec81c90ab9cd1ebf0d43577c 100644 (file)
@@ -178,11 +178,16 @@ stdvga=0
 serial='pty'
 
 #----------------------------------------------------------------------------
-#   tsc_native : TSC mode (0=emulate TSC, 1=native TSC)
+#   tsc_mode : TSC mode (0=default, 1=native TSC, 2=never emulate, 3=pvrdtscp)
 #   emulate TSC provides synced TSC for all vcpus, but lose perfomrance.
 #   native TSC leverages hardware's TSC(no perf loss), but vcpu's TSC may lose
-#   sync due to hardware's unreliable/unsynced TSC between CPUs.
-tsc_native=1
+#    sync due to hardware's unreliable/unsynced TSC between CPUs.
+#   default intelligently uses native TSC on machines where it is safe, but
+#    switches to emulated if necessary after save/restore/migration
+#   pvrdtscp is for intelligent apps that use special Xen-only paravirtualized
+#    cpuid instructions to obtain offset/scaling/migration info and maximize
+#    performance within pools of machines that support the rdtscp instruction
+tsc_mode=0
 
 #-----------------------------------------------------------------------------
 #   Qemu Monitor, default is disable
index f8725839267649fc1f4ebb59171a274dba76611e..54a5914e9b23eab3a4c195ccecce93eb4ffa288b 100644 (file)
@@ -466,24 +466,61 @@ int xc_domain_set_time_offset(int xc_handle,
     return do_domctl(xc_handle, &domctl);
 }
 
-int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native)
+int xc_domain_disable_migrate(int xc_handle, uint32_t domid)
 {
     DECLARE_DOMCTL;
-    domctl.cmd = XEN_DOMCTL_set_tsc_native;
+    domctl.cmd = XEN_DOMCTL_disable_migrate;
     domctl.domain = (domid_t)domid;
-    domctl.u.set_tsc_native.is_native = is_native;
+    domctl.u.disable_migrate.disable = 1;
     return do_domctl(xc_handle, &domctl);
 }
 
-int xc_domain_disable_migrate(int xc_handle, uint32_t domid)
+int xc_domain_set_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t tsc_mode,
+                           uint64_t elapsed_nsec,
+                           uint32_t gtsc_khz,
+                           uint32_t incarnation)
 {
     DECLARE_DOMCTL;
-    domctl.cmd = XEN_DOMCTL_disable_migrate;
+    domctl.cmd = XEN_DOMCTL_settscinfo;
     domctl.domain = (domid_t)domid;
-    domctl.u.disable_migrate.disable = 1;
+    domctl.u.tsc_info.info.tsc_mode = tsc_mode;
+    domctl.u.tsc_info.info.elapsed_nsec = elapsed_nsec;
+    domctl.u.tsc_info.info.gtsc_khz = gtsc_khz;
+    domctl.u.tsc_info.info.incarnation = incarnation;
     return do_domctl(xc_handle, &domctl);
 }
 
+int xc_domain_get_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t *tsc_mode,
+                           uint64_t *elapsed_nsec,
+                           uint32_t *gtsc_khz,
+                           uint32_t *incarnation)
+{
+    int rc;
+    DECLARE_DOMCTL;
+    xen_guest_tsc_info_t info = { 0 };
+
+    domctl.cmd = XEN_DOMCTL_gettscinfo;
+    domctl.domain = (domid_t)domid;
+    set_xen_guest_handle(domctl.u.tsc_info.out_info, &info);
+    if ( (rc = lock_pages(&info, sizeof(info))) != 0 )
+        return rc;
+    rc = do_domctl(xc_handle, &domctl);
+    if ( rc == 0 )
+    {
+        *tsc_mode = info.tsc_mode;
+        *elapsed_nsec = info.elapsed_nsec;
+        *gtsc_khz = info.gtsc_khz;
+        *incarnation = info.incarnation;
+    }
+    unlock_pages(&info,sizeof(info));
+    return rc;
+}
+
+
 int xc_domain_memory_increase_reservation(int xc_handle,
                                           uint32_t domid,
                                           unsigned long nr_extents,
index 01d7924f07ebc4d0e8400073a331ea18f6562959..cf6a63c25abdd5225413596c89ff57196abd4afc 100644 (file)
@@ -1084,6 +1084,18 @@ static int pagebuf_get_one(pagebuf_t* buf, int fd, int xch, uint32_t dom)
             return -1;
         }
         return pagebuf_get_one(buf, fd, xch, dom);
+    } else if ( count == -7 ) {
+        uint32_t tsc_mode, khz, incarn;
+        uint64_t nsec;
+        if ( read_exact(fd, &tsc_mode, sizeof(uint32_t)) ||
+             read_exact(fd, &nsec, sizeof(uint64_t)) ||
+             read_exact(fd, &khz, sizeof(uint32_t)) ||
+             read_exact(fd, &incarn, sizeof(uint32_t)) ||
+             xc_domain_set_tsc_info(xch, dom, tsc_mode, nsec, khz, incarn) ) {
+            ERROR("error reading/restoring tsc info");
+            return -1;
+        }
+        return pagebuf_get_one(buf, fd, xch, dom);
     } else if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
         ERROR("Max batch size exceeded (%d). Giving up.", count);
         return -1;
index 30c1b6d3a41161dc5b69bde61970cb9b5c7b87c0..9d706a92d3be7fe118ab640dc0878beb4171cdd1 100644 (file)
@@ -841,6 +841,24 @@ static xen_pfn_t *map_and_save_p2m_table(int xc_handle,
     return success ? p2m : NULL;
 }
 
+/* must be done AFTER suspend_and_state() */
+static int save_tsc_info(int xc_handle, uint32_t dom, int io_fd)
+{
+    int marker = -7;
+    uint32_t tsc_mode, khz, incarn;
+    uint64_t nsec;
+
+    if ( xc_domain_get_tsc_info(xc_handle, dom, &tsc_mode,
+                                &nsec, &khz, &incarn) < 0  ||
+         write_exact(io_fd, &marker, sizeof(marker)) ||
+         write_exact(io_fd, &tsc_mode, sizeof(tsc_mode)) ||
+         write_exact(io_fd, &nsec, sizeof(nsec)) ||
+         write_exact(io_fd, &khz, sizeof(khz)) ||
+         write_exact(io_fd, &incarn, sizeof(incarn)) )
+        return -1;
+    return 0;
+}
+
 int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                    uint32_t max_factor, uint32_t flags,
                    struct save_callbacks* callbacks,
@@ -1100,6 +1118,12 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
         goto out;
     }
 
+    if ( !live && save_tsc_info(xc_handle, dom, io_fd) < 0 )
+    {
+        ERROR("Error when writing to state file (tsc)");
+        goto out;
+    }
+
   copypages:
 #define write_exact(fd, buf, len) write_buffer(last_iter, &ob, (fd), (buf), (len))
 #ifdef ratewrite
@@ -1458,6 +1482,13 @@ int xc_domain_save(int xc_handle, int io_fd, uint32_t dom, uint32_t max_iters,
                         goto out;
                 }
 
+                if ( save_tsc_info(xc_handle, dom, io_fd) < 0 )
+                {
+                    ERROR("Error when writing to state file (tsc)");
+                    goto out;
+                }
+
+
             }
 
             if ( xc_shadow_control(xc_handle, dom, 
index d6ecaf399c1b913365bbc99bff716d4a621a943e..9fc05bb30b336518e140293c5db1f214e1d60b22 100644 (file)
@@ -628,7 +628,19 @@ int xc_domain_set_time_offset(int xc_handle,
                               uint32_t domid,
                               int32_t time_offset_seconds);
 
-int xc_domain_set_tsc_native(int xc_handle, uint32_t domid, int is_native);
+int xc_domain_set_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t tsc_mode,
+                           uint64_t elapsed_nsec,
+                           uint32_t gtsc_khz,
+                           uint32_t incarnation);
+
+int xc_domain_get_tsc_info(int xc_handle,
+                           uint32_t domid,
+                           uint32_t *tsc_mode,
+                           uint64_t *elapsed_nsec,
+                           uint32_t *gtsc_khz,
+                           uint32_t *incarnation);
 
 int xc_domain_disable_migrate(int xc_handle, uint32_t domid);
 
index 7eaf63b94e866efe9461e76898f680c20ca03917..aa780aa303c87a20a84b0d067643cd3248c16b7b 100644 (file)
@@ -1486,14 +1486,14 @@ static PyObject *pyxc_domain_set_time_offset(XcObject *self, PyObject *args)
     return zero;
 }
 
-static PyObject *pyxc_domain_set_tsc_native(XcObject *self, PyObject *args)
+static PyObject *pyxc_domain_set_tsc_info(XcObject *self, PyObject *args)
 {
-    uint32_t dom, is_native;
+    uint32_t dom, tsc_mode;
 
-    if (!PyArg_ParseTuple(args, "ii", &dom, &is_native))
+    if (!PyArg_ParseTuple(args, "ii", &dom, &tsc_mode))
         return NULL;
 
-    if (xc_domain_set_tsc_native(self->xc_handle, dom, is_native) != 0)
+    if (xc_domain_set_tsc_info(self->xc_handle, dom, tsc_mode, 0, 0, 0) != 0)
         return pyxc_error_to_exception();
 
     Py_INCREF(zero);
@@ -2036,12 +2036,13 @@ static PyMethodDef pyxc_methods[] = {
       " offset     [int]: Time offset from UTC in seconds.\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
-    { "domain_set_tsc_native",
-      (PyCFunction)pyxc_domain_set_tsc_native,
+    { "domain_set_tsc_info",
+      (PyCFunction)pyxc_domain_set_tsc_info,
       METH_VARARGS, "\n"
-      "Set a domain's TSC mode (emulate vs native)\n"
+      "Set a domain's TSC mode\n"
       " dom        [int]: Domain whose TSC mode is being set.\n"
-      " is_native  [int]: 1=native, 0=emulate.\n"
+      " tsc_mode   [int]: 0=default (monotonic, but native where possible)\n"
+      "                   1=always emulate 2=never emulate 3=pvrdtscp\n"
       "Returns: [int] 0 on success; -1 on error.\n" },
 
     { "domain_disable_migrate",
index 0eadf343d3951aeff3e7ee60b87317d76a997204..3227cd4def0467b961b8d311dceaa0bc23ca391b 100644 (file)
@@ -163,7 +163,7 @@ XENAPI_PLATFORM_CFG_TYPES = {
     'vncdisplay': int,
     'vnclisten': str,
     'timer_mode': int,
-    'tsc_native': int,
+    'tsc_mode': int,
     'vpt_align': int,
     'viridian': int,
     'vncpasswd': str,
@@ -477,8 +477,8 @@ class XendConfig(dict):
             if not os.path.exists(self['platform']['device_model']):
                 raise VmError("device model '%s' not found" % str(self['platform']['device_model']))
 
-        if 'tsc_native' not in self['platform']:
-            self['platform']['tsc_native'] = 0
+        if 'tsc_mode' not in self['platform']:
+            self['platform']['tsc_mode'] = 0
 
         if 'nomigrate' not in self['platform']:
             self['platform']['nomigrate'] = 0
index 592ba6fad8a69f6ebb8d64119b84b45fab294755..8198228b9f7bb4c0343f2743d0be52ba5e43a739 100644 (file)
@@ -2468,9 +2468,9 @@ class XendDomainInfo:
         self._recreateDom()
 
         # Set TSC mode of domain
-        tsc_native = self.info["platform"].get("tsc_native")
-        if arch.type == "x86" and tsc_native is not None:
-            xc.domain_set_tsc_native(self.domid, int(tsc_native))
+        tsc_mode = self.info["platform"].get("tsc_mode")
+        if arch.type == "x86" and tsc_mode is not None:
+            xc.domain_set_tsc_info(self.domid, int(tsc_mode))
 
         # Set timer configuration of domain
         timer_mode = self.info["platform"].get("timer_mode")
index d6485a59a193c4809449b5ec658d02685569125d..32b19235e1d4207ff26afeab03ebc0732a676b0b 100644 (file)
@@ -221,9 +221,9 @@ gopts.var('timer_mode', val='TIMER_MODE',
           use="""Timer mode (0=delay virtual time when ticks are missed;
           1=virtual time is always wallclock time.""")
 
-gopts.var('tsc_native', val='TSC_NATIVE',
+gopts.var('tsc_mode', val='TSC_MODE',
           fn=set_int, default=0,
-          use="""TSC mode (0=emulate TSC, 1=native TSC).""")
+          use="""TSC mode (0=default, 1=always emulate, 2=never emulate, 3=pvrdtscp).""")
 
 gopts.var('nomigrate', val='NOMIGRATE',
           fn=set_int, default=0,
@@ -738,8 +738,8 @@ def configure_image(vals):
     if vals.suppress_spurious_page_faults:
         config_image.append(['suppress_spurious_page_faults', vals.suppress_spurious_page_faults])
 
-    if vals.tsc_native is not None:
-        config_image.append(['tsc_native', vals.tsc_native])
+    if vals.tsc_mode is not None:
+        config_image.append(['tsc_mode', vals.tsc_mode])
 
     if vals.nomigrate is not None:
         config_image.append(['nomigrate', vals.nomigrate])
@@ -1036,7 +1036,7 @@ def make_config(vals):
                 config.append([n, v])
 
     map(add_conf, ['name', 'memory', 'maxmem', 'shadow_memory',
-                   'restart', 'on_poweroff', 'tsc_native', 'nomigrate',
+                   'restart', 'on_poweroff',  'tsc_mode', 'nomigrate',
                    'on_reboot', 'on_crash', 'vcpus', 'vcpu_avail', 'features',
                    'on_xend_start', 'on_xend_stop', 'target', 'cpuid',
                    'cpuid_check', 'machine_address_size', 'suppress_spurious_page_faults'])
index 9cfdb87d7ead3ea52b45c744db7664b929f0b9bd..4c0177b4aa3e27a932e9857223d3ad20fa08e624 100644 (file)
@@ -1108,7 +1108,7 @@ class sxp2xml:
             'pci_msitranslate',
             'pci_power_mgmt',
             'xen_platform_pci',
-            'tsc_native'
+            'tsc_mode'
             'description',
             'nomigrate'
         ]
index c851209853d2c091d0d83ffdbb8a03216c21dd8b..84493aba435abff29ff76f698cd2314dcc915367 100644 (file)
@@ -520,6 +520,8 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
         d->arch.cpuids[i].input[1] = XEN_CPUID_INPUT_UNUSED;
     }
 
+    /* initialize default tsc behavior in case tools don't */
+    tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
     spin_lock_init(&d->arch.vtsc_lock);
 
     return 0;
index 52f3945f06f44ac842287fe717d67e8bb505863b..4b0011a52c942ebd8420d80d15d996e4b2c5238c 100644 (file)
@@ -1101,9 +1101,10 @@ long arch_do_domctl(
     }
     break;
 
-    case XEN_DOMCTL_set_tsc_native:
+    case XEN_DOMCTL_gettscinfo:
     {
         struct domain *d;
+        xen_guest_tsc_info_t info;
 
         ret = -ESRCH;
         d = rcu_lock_domain_by_id(domctl->domain);
@@ -1111,9 +1112,34 @@ long arch_do_domctl(
             break;
 
         domain_pause(d);
-        d->arch.vtsc = !domctl->u.set_tsc_native.is_native;
-        if ( is_hvm_domain(d) )
-            hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+        tsc_get_info(d, &info.tsc_mode,
+                        &info.elapsed_nsec,
+                        &info.gtsc_khz,
+                        &info.incarnation);
+        if ( copy_to_guest(domctl->u.tsc_info.out_info, &info, 1) )
+            ret = -EFAULT;
+        else
+            ret = 0;
+        domain_unpause(d);
+
+        rcu_unlock_domain(d);
+    }
+    break;
+
+    case XEN_DOMCTL_settscinfo:
+    {
+        struct domain *d;
+
+        ret = -ESRCH;
+        d = rcu_lock_domain_by_id(domctl->domain);
+        if ( d == NULL )
+            break;
+
+        domain_pause(d);
+        tsc_set_info(d, domctl->u.tsc_info.info.tsc_mode,
+                     domctl->u.tsc_info.info.elapsed_nsec,
+                     domctl->u.tsc_info.info.gtsc_khz,
+                     domctl->u.tsc_info.info.incarnation);
         domain_unpause(d);
 
         rcu_unlock_domain(d);
index f4d342a52fc71a38231562f00feaf057810d34f0..ca567638aeeb25953471d124f3e8cc20c4d2ab43 100644 (file)
@@ -1831,7 +1831,7 @@ void hvm_cpuid(unsigned int input, unsigned int *eax, unsigned int *ebx,
     if ( cpuid_viridian_leaves(input, eax, ebx, ecx, edx) )
         return;
 
-    if ( cpuid_hypervisor_leaves(input, eax, ebx, ecx, edx) )
+    if ( cpuid_hypervisor_leaves(input, count, eax, ebx, ecx, edx) )
         return;
 
     domain_cpuid(v->domain, input, *ecx, eax, ebx, ecx, edx);
index 982ce7ca5283db0883f0bebb78cd932170886147..e3652a9d450a8b104614b2f60e8d57d4b0ad25f1 100644 (file)
@@ -34,6 +34,7 @@
 #include <asm/hpet.h>
 #include <io_ports.h>
 #include <asm/setup.h> /* for early_time_init */
+#include <public/arch-x86/cpuid.h>
 
 /* opt_clocksource: Force clocksource to one of: pit, hpet, cyclone, acpi. */
 static char __initdata opt_clocksource[10];
@@ -45,10 +46,12 @@ unsigned long pit0_ticks;
 static u32 wc_sec, wc_nsec; /* UTC time at last 'time update'. */
 static DEFINE_SPINLOCK(wc_lock);
 
+/* moved to <asm/domain.h>
 struct time_scale {
     int shift;
     u32 mul_frac;
 };
+*/
 
 struct cpu_time {
     u64 local_tsc_stamp;
@@ -150,13 +153,32 @@ static inline u64 scale_delta(u64 delta, struct time_scale *scale)
     return product;
 }
 
+#define _TS_SHIFT_IDENTITY    1
+#define _TS_MUL_FRAC_IDENTITY 0x80000000UL
+#define _TS_IDENTITY { _TS_SHIFT_IDENTITY, _TS_MUL_FRAC_IDENTITY }
+static inline int time_scale_is_identity(struct time_scale *ts)
+{
+    if ( ts->shift != _TS_SHIFT_IDENTITY )
+        return 0;
+    else if ( ts->mul_frac != _TS_MUL_FRAC_IDENTITY )
+        return 0;
+    return 1;
+}
+
+static inline void set_time_scale_identity(struct time_scale *ts)
+{
+    ts->shift = _TS_SHIFT_IDENTITY;
+    ts->mul_frac = _TS_MUL_FRAC_IDENTITY;
+}
+
 /* Compute the reciprocal of the given time_scale. */
 static inline struct time_scale scale_reciprocal(struct time_scale scale)
 {
     struct time_scale reciprocal;
     u32 dividend;
 
-    dividend = 0x80000000u;
+    ASSERT(scale.mul_frac != 0);
+    dividend = _TS_MUL_FRAC_IDENTITY;
     reciprocal.shift = 1 - scale.shift;
     while ( unlikely(dividend >= scale.mul_frac) )
     {
@@ -818,6 +840,8 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
     struct cpu_time       *t;
     struct vcpu_time_info *u, _u;
     XEN_GUEST_HANDLE(vcpu_time_info_t) user_u;
+    struct domain *d = v->domain;
+    s_time_t tsc_stamp = 0;
 
     if ( v->vcpu_info == NULL )
         return;
@@ -825,20 +849,31 @@ static void __update_vcpu_system_time(struct vcpu *v, int force)
     t = &this_cpu(cpu_time);
     u = &vcpu_info(v, time);
 
+    if ( d->arch.vtsc )
+    {
+        tsc_stamp = t->stime_local_stamp - d->arch.vtsc_offset;
+        if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) )
+            tsc_stamp = scale_delta(tsc_stamp, &d->arch.ns_to_vtsc);
+    }
+    else
+        tsc_stamp = t->local_tsc_stamp;
+
+    if ( d->arch.tsc_mode ==  TSC_MODE_PVRDTSCP &&
+              boot_cpu_has(X86_FEATURE_RDTSCP) )
+        write_rdtscp_aux(d->arch.incarnation);
+
     /* Don't bother unless timestamps have changed or we are forced. */
-    if ( !force && (u->tsc_timestamp == (v->domain->arch.vtsc
-                                         ? t->stime_local_stamp
-                                         : t->local_tsc_stamp)) )
+    if ( !force && (u->tsc_timestamp == tsc_stamp) )
         return;
 
     memset(&_u, 0, sizeof(_u));
 
-    if ( v->domain->arch.vtsc )
+    if ( d->arch.vtsc )
     {
-        _u.tsc_timestamp     = t->stime_local_stamp;
+        _u.tsc_timestamp     = tsc_stamp;
         _u.system_time       = t->stime_local_stamp;
-        _u.tsc_to_system_mul = 0x80000000u;
-        _u.tsc_shift         = 1;
+        _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
+        _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
     }
     else
     {
@@ -1556,7 +1591,7 @@ static void tsc_check_slave(void *unused)
     local_irq_enable();
 }
 
-static void tsc_check_reliability(void)
+void tsc_check_reliability(void)
 {
     unsigned int cpu = smp_processor_id();
     static DEFINE_SPINLOCK(lock);
@@ -1583,57 +1618,245 @@ static void tsc_check_reliability(void)
 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs)
 {
     s_time_t now = get_s_time();
+    struct domain *d = v->domain;
 
-    spin_lock(&v->domain->arch.vtsc_lock);
+    spin_lock(&d->arch.vtsc_lock);
 
     if ( guest_kernel_mode(v, regs) )
-        v->domain->arch.vtsc_kerncount++;
+        d->arch.vtsc_kerncount++;
     else
-        v->domain->arch.vtsc_usercount++;
+        d->arch.vtsc_usercount++;
 
-    if ( (int64_t)(now - v->domain->arch.vtsc_last) > 0 )
-        v->domain->arch.vtsc_last = now;
+    if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
+        d->arch.vtsc_last = now;
     else
-        now = ++v->domain->arch.vtsc_last;
+        now = ++d->arch.vtsc_last;
 
-    spin_unlock(&v->domain->arch.vtsc_lock);
+    spin_unlock(&d->arch.vtsc_lock);
+
+    now = now - d->arch.vtsc_offset;
+    if ( !time_scale_is_identity(&d->arch.ns_to_vtsc) )
+        now = scale_delta(now, &d->arch.ns_to_vtsc);
 
     regs->eax = (uint32_t)now;
     regs->edx = (uint32_t)(now >> 32);
 }
 
+static int host_tsc_is_safe(void)
+{
+    extern unsigned int max_cstate;
+
+    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
+        return 1;
+    if ( num_online_cpus() == 1 )
+        return 1;
+    if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && max_cstate <= 2 )
+    {
+        if ( !tsc_check_count )
+            tsc_check_reliability();
+        if ( tsc_max_warp == 0 )
+            return 1;
+    }
+    return 0;
+}
+
+void cpuid_time_leaf(uint32_t sub_idx, uint32_t *eax, uint32_t *ebx,
+                      uint32_t *ecx, uint32_t *edx)
+{
+    struct domain *d = current->domain;
+    struct cpu_time *t;
+
+    t = &this_cpu(cpu_time);
+    switch ( sub_idx )
+    {
+    case 0: /* features */
+        *eax = ( ( (!!d->arch.vtsc) << 0 ) |
+                 ( (!!host_tsc_is_safe()) << 1 ) |
+                 ( (!!boot_cpu_has(X86_FEATURE_RDTSCP)) << 2 ) |
+               0 );
+        *ebx = d->arch.tsc_mode;
+        *ecx = d->arch.tsc_khz;
+        *edx = d->arch.incarnation;
+        break;
+    case 1: /* pvclock group1 */ /* FIXME are these right? */
+        *eax = (uint32_t)t->local_tsc_stamp;
+        *ebx = (uint32_t)(t->local_tsc_stamp >> 32);
+        *ecx = t->tsc_scale.mul_frac;
+        *edx = d->arch.incarnation;
+        break;
+    case 2: /* pvclock scaling values */ /* FIXME  are these right? */
+        *eax = (uint32_t)t->stime_local_stamp;
+        *ebx = (uint32_t)(t->stime_local_stamp >> 32);
+        *ecx = t->tsc_scale.shift;
+        *edx = d->arch.incarnation;
+    case 3: /* physical cpu_khz */
+        *eax = cpu_khz;
+        *ebx = *ecx = 0;
+        *edx = d->arch.incarnation;
+        break;
+    }
+}
+
+/*
+ * called to collect tsc-related data only for save file or live
+ * migrate; called after last rdtsc is done on this incarnation
+ */
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
+                  uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
+                  uint32_t *incarnation)
+{
+    *incarnation = d->arch.incarnation;
+    switch ( *tsc_mode = d->arch.tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        *elapsed_nsec =  *gtsc_khz = 0;
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
+        *gtsc_khz = 1000000UL;
+         break;
+    case TSC_MODE_DEFAULT:
+        if ( d->arch.vtsc )
+        {
+            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
+            *gtsc_khz =  d->arch.tsc_khz;
+        }  else {
+            uint64_t tsc = 0;
+            rdtscll(tsc);
+            *elapsed_nsec = scale_delta(tsc,&d->arch.vtsc_to_ns);
+            *gtsc_khz =  cpu_khz;
+        }
+        break;
+    case TSC_MODE_PVRDTSCP:
+        *elapsed_nsec = get_s_time() - d->arch.vtsc_offset; /* FIXME scale? */
+        *gtsc_khz =  d->arch.tsc_khz;
+        break;
+    }
+}
+
+/*
+ * This may be called as many as three times for a domain, once when the
+ * hypervisor creates the domain, once when the toolstack creates the
+ * domain and, if restoring/migrating, once when saved/migrated values
+ * are restored.  Care must be taken that, if multiple calls occur,
+ * only the last "sticks" and all are completed before the guest executes
+ * an rdtsc instruction
+ */
+void tsc_set_info(struct domain *d,
+                  uint32_t tsc_mode, uint64_t elapsed_nsec,
+                  uint32_t gtsc_khz, uint32_t incarnation)
+{
+    if ( d->domain_id == 0 || d->domain_id == DOMID_INVALID )
+    {
+        d->arch.vtsc = 0;
+        return;
+    }
+    switch ( d->arch.tsc_mode = tsc_mode )
+    {
+    case TSC_MODE_NEVER_EMULATE:
+        gdprintk(XENLOG_G_INFO, "%s: never emulating TSC\n",__func__)
+        d->arch.vtsc = 0;
+        break;
+    case TSC_MODE_ALWAYS_EMULATE:
+        gdprintk(XENLOG_G_INFO, "%s: always emulating TSC\n",__func__)
+        d->arch.vtsc = 1;
+        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+        set_time_scale_identity(&d->arch.vtsc_to_ns);
+        break;
+    case TSC_MODE_DEFAULT:
+        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+        if ( (host_tsc_is_safe() && incarnation == 0) || !d->domain_id )
+        {
+            gdprintk(XENLOG_G_INFO, "%s: using safe native TSC\n",__func__)
+            /* use native TSC if initial host supports it */
+            d->arch.vtsc = 0;
+            d->arch.tsc_khz = gtsc_khz ? gtsc_khz : cpu_khz;
+            set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000 );
+            set_time_scale_identity(&d->arch.ns_to_vtsc);
+        } else if ( gtsc_khz != 0  && gtsc_khz != 1000000UL ) {
+            gdprintk(XENLOG_G_INFO, "%s: safe native TSC on initial host,"
+                "but now using emulation\n",__func__)
+            /* was native on initial host, now emulated at initial tsc hz*/
+            d->arch.vtsc = 1;
+            d->arch.tsc_khz = gtsc_khz;
+            set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 );
+            d->arch.ns_to_vtsc =
+                scale_reciprocal(d->arch.vtsc_to_ns);
+        } else {
+            gdprintk(XENLOG_G_INFO, "%s: unsafe TSC on initial host,"
+                "using emulation\n",__func__)
+            d->arch.vtsc = 1;
+            set_time_scale_identity(&d->arch.vtsc_to_ns);
+            set_time_scale_identity(&d->arch.ns_to_vtsc);
+        }
+        break;
+    case TSC_MODE_PVRDTSCP:
+        gdprintk(XENLOG_G_INFO, "%s: using PVRDTSCP\n",__func__)
+        if ( boot_cpu_has(X86_FEATURE_RDTSCP) && gtsc_khz != 0 ) {
+            d->arch.vtsc = 0;
+            set_time_scale(&d->arch.vtsc_to_ns, gtsc_khz * 1000 );
+        } else {
+            d->arch.vtsc = 1;
+            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
+            set_time_scale_identity(&d->arch.vtsc_to_ns);
+        }
+        break;
+    }
+    d->arch.incarnation = incarnation + 1;
+    if ( is_hvm_domain(d) )
+        hvm_set_rdtsc_exiting(d, d->arch.vtsc || hvm_gtsc_need_scale(d));
+}
+
 /* vtsc may incur measurable performance degradation, diagnose with this */
 static void dump_softtsc(unsigned char key)
 {
     struct domain *d;
     int domcnt = 0;
+    extern unsigned int max_cstate;
 
     tsc_check_reliability();
     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
         printk("TSC marked as reliable, "
                "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
     else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC ) )
-        printk("TSC marked as constant but not reliable, "
-               "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
-    else
+    {
+        printk("TSC has constant rate, ");
+        if (max_cstate <= 2 && tsc_max_warp == 0)
+            printk("no deep Cstates, passed warp test, deemed reliable, ");
+        else
+            printk("deep Cstates possible, so not reliable, ");
+        printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+    } else
         printk("TSC not marked as either constant or reliable, "
-               "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
+               "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
     for_each_domain ( d )
     {
+        if ( d->domain_id == 0 && d->arch.tsc_mode == TSC_MODE_DEFAULT )
+            continue;
+        printk("dom%u%s: mode=%d",d->domain_id,
+                is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
+        if ( d->arch.vtsc_offset )
+            printk(",ofs=0x%"PRIx64"",d->arch.vtsc_offset);
+        if ( d->arch.tsc_khz )
+            printk(",khz=%"PRIu32"",d->arch.tsc_khz);
+        if ( d->arch.incarnation )
+            printk(",inc=%"PRIu32"",d->arch.incarnation);
         if ( !d->arch.vtsc )
+        {
+            printk("\n");
             continue;
+        }
         if ( is_hvm_domain(d) )
-            printk("dom%u (hvm) vtsc count: %"PRIu64" total\n",
-                   d->domain_id, d->arch.vtsc_kerncount);
+            printk(",vtsc count: %"PRIu64" total\n",
+                   d->arch.vtsc_kerncount);
         else
-            printk("dom%u vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
-                   d->domain_id, d->arch.vtsc_kerncount,
-                   d->arch.vtsc_usercount);
+            printk(",vtsc count: %"PRIu64" kernel, %"PRIu64" user\n",
+                   d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
         domcnt++;
     }
 
     if ( !domcnt )
-            printk("All domains have native TSC\n");
+            printk("No domains have emulated TSC\n");
 }
 
 static struct keyhandler dump_softtsc_keyhandler = {
index e42420c8a137e25d3ed35ada5c771cd6e73b1184..174dc25af41205125e1aa6884442e1d4020b2413 100644 (file)
@@ -679,8 +679,8 @@ int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
     return 1;
 }
 
-int cpuid_hypervisor_leaves(
-    uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
+               uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
 {
     struct domain *d = current->domain;
     /* Optionally shift out of the way of Viridian architectural leaves. */
@@ -693,7 +693,7 @@ int cpuid_hypervisor_leaves(
     switch ( idx )
     {
     case 0:
-        *eax = base + 2; /* Largest leaf */
+        *eax = base + 3; /* Largest leaf */
         *ebx = XEN_CPUID_SIGNATURE_EBX;
         *ecx = XEN_CPUID_SIGNATURE_ECX;
         *edx = XEN_CPUID_SIGNATURE_EDX;
@@ -717,6 +717,11 @@ int cpuid_hypervisor_leaves(
             *ecx |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
         break;
 
+    case 3:
+        *eax = *ebx = *ecx = *edx = 0;
+        cpuid_time_leaf( sub_idx, eax, ebx, ecx, edx );
+        break;
+
     default:
         BUG();
     }
@@ -735,7 +740,7 @@ static void pv_cpuid(struct cpu_user_regs *regs)
 
     if ( current->domain->domain_id != 0 )
     {
-        if ( !cpuid_hypervisor_leaves(a, &a, &b, &c, &d) )
+        if ( !cpuid_hypervisor_leaves(a, c, &a, &b, &c, &d) )
             domain_cpuid(current->domain, a, c, &a, &b, &c, &d);
         goto out;
     }
@@ -815,7 +820,7 @@ static void pv_cpuid(struct cpu_user_regs *regs)
         a = b = c = d = 0;
         break;
     default:
-        (void)cpuid_hypervisor_leaves(regs->eax, &a, &b, &c, &d);
+        (void)cpuid_hypervisor_leaves(regs->eax, 0, &a, &b, &c, &d);
         break;
     }
 
index 3c122b5d443d74a52f67f97463100d22a730cce2..a80a3ede5cba9988995faff145363326e7af4c5c 100644 (file)
@@ -230,6 +230,11 @@ struct domain_mca_msrs
     spinlock_t lock;
 };
 
+struct time_scale {
+    int shift;
+    u32 mul_frac;
+};
+
 struct arch_domain
 {
 #ifdef CONFIG_X86_64
@@ -298,10 +303,17 @@ struct arch_domain
     /* For Guest vMCA handling */
     struct domain_mca_msrs vmca_msrs;
 
-    /* SoftTSC emulation */
-    bool_t vtsc;
-    s_time_t vtsc_last;
+    /* TSC management (emulation, pv, scaling, stats) */
+    int tsc_mode;            /* see include/asm-x86/time.h */
+    bool_t vtsc;             /* tsc is emulated (may change after migrate) */
+    s_time_t vtsc_last;      /* previous TSC value (guarantee monotonicity) */
     spinlock_t vtsc_lock;
+    uint64_t vtsc_offset;    /* adjustment for save/restore/migrate */
+    uint32_t tsc_khz;        /* cached khz for certain emulated cases */
+    struct time_scale vtsc_to_ns; /* scaling for certain emulated cases */
+    struct time_scale ns_to_vtsc; /* scaling for certain emulated cases */
+    uint32_t incarnation;    /* incremented every restore or live migrate
+                                (possibly other cases in the future) */
     uint64_t vtsc_kerncount; /* for hvm, counts all vtsc */
     uint64_t vtsc_usercount; /* not used for hvm */
 } __cacheline_aligned;
index 56bb080f52e14fa3ee4854fb82058db446d23f16..a65f080569a26da8513105bc4e341a408472bfd7 100644 (file)
@@ -84,6 +84,8 @@ static inline void wrmsrl(unsigned int msr, __u64 val)
 
 #define write_tsc(val) wrmsrl(MSR_IA32_TSC, val)
 
+#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
+
 #define rdpmc(counter,low,high) \
      __asm__ __volatile__("rdpmc" \
                          : "=a" (low), "=d" (high) \
index 7b09adecd630066d3b0be93ed804e5491095b5ea..628965ae3eda91eeae105e8506c2efa938c95ffd 100644 (file)
@@ -550,8 +550,8 @@ asmlinkage void do_machine_check(struct cpu_user_regs *regs);
 void cpu_mcheck_distribute_cmci(void);
 void cpu_mcheck_disable(void);
 
-int cpuid_hypervisor_leaves(
-    uint32_t idx, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+int cpuid_hypervisor_leaves( uint32_t idx, uint32_t sub_idx,
+          uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val);
 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val);
 
index c72cb8e50652732dac4f966889ed8f930ca636d0..6dd071d7266327f8d871efc7c5f0ad0156240eaa 100644 (file)
@@ -4,6 +4,24 @@
 
 #include <asm/msr.h>
 
+/*
+ *  PV TSC emulation modes:
+ *    0 = guest rdtsc/p executed natively when monotonicity can be guaranteed
+ *         and emulated otherwise (with frequency scaled if necessary)
+ *    1 = guest rdtsc/p always emulated at 1GHz (kernel and user)
+ *    2 = guest rdtsc always executed natively (no monotonicity/frequency
+ *         guarantees); guest rdtscp emulated at native frequency if
+ *         unsupported by h/w, else executed natively
+ *    3 = same as 2, except xen manages TSC_AUX register so guest can
+ *         determine when a restore/migration has occurred and assumes
+ *         guest obtains/uses pvclock-like mechanism to adjust for
+ *         monotonicity and frequency changes
+ */
+#define TSC_MODE_DEFAULT          0
+#define TSC_MODE_ALWAYS_EMULATE   1
+#define TSC_MODE_NEVER_EMULATE    2
+#define TSC_MODE_PVRDTSCP         3
+
 void calibrate_tsc_bp(void);
 void calibrate_tsc_ap(void);
 
@@ -43,6 +61,16 @@ uint64_t ns_to_acpi_pm_tick(uint64_t ns);
 
 void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs);
 
+void tsc_set_info(struct domain *d, uint32_t tsc_mode, uint64_t elapsed_nsec,
+                  uint32_t gtsc_khz, uint32_t incarnation);
+
+void tsc_get_info(struct domain *d, uint32_t *tsc_mode, uint64_t *elapsed_nsec,
+                  uint32_t *gtsc_khz, uint32_t *incarnation);
+
+
 void force_update_vcpu_system_time(struct vcpu *v);
 
+void cpuid_time_leaf(uint32_t sub_idx, unsigned int *eax, unsigned int *ebx,
+                      unsigned int *ecx, unsigned int *edx);
+
 #endif /* __X86_TIME_H__ */
index cac34776717d7ca609c9ae6d347bcf57c3a95394..88b19a4ffe1835722eadbe924447817a23791932 100644 (file)
@@ -401,11 +401,6 @@ struct xen_domctl_settimeoffset {
 typedef struct xen_domctl_settimeoffset xen_domctl_settimeoffset_t;
 DEFINE_XEN_GUEST_HANDLE(xen_domctl_settimeoffset_t);
 
-#define XEN_DOMCTL_set_tsc_native    57
-typedef struct xen_domctl_set_tsc_native {
-    uint32_t is_native; /* IN: 0: TSC is emulated; 1: TSC is host TSC */
-} xen_domctl_set_tsc_native_t;
-
 #define XEN_DOMCTL_gethvmcontext     33
 #define XEN_DOMCTL_sethvmcontext     34
 typedef struct xen_domctl_hvmcontext {
@@ -656,6 +651,22 @@ typedef struct xen_domctl_disable_migrate {
 } xen_domctl_disable_migrate_t;
 
 
+#define XEN_DOMCTL_gettscinfo    59
+#define XEN_DOMCTL_settscinfo    60
+struct xen_guest_tsc_info {
+    uint32_t tsc_mode;
+    uint32_t gtsc_khz;
+    uint32_t incarnation;
+    uint32_t pad;
+    uint64_t elapsed_nsec;
+};
+typedef struct xen_guest_tsc_info xen_guest_tsc_info_t;
+DEFINE_XEN_GUEST_HANDLE(xen_guest_tsc_info_t);
+typedef struct xen_domctl_tsc_info {
+    XEN_GUEST_HANDLE_64(xen_guest_tsc_info_t) out_info; /* OUT */
+    xen_guest_tsc_info_t info; /* IN */
+} xen_domctl_tsc_info_t;
+
 #define XEN_DOMCTL_gdbsx_guestmemio     1000 /* guest mem io */
 struct xen_domctl_gdbsx_memio {
     uint64_aligned_t pgd3val;/* optional: init_mm.pgd[3] value */
@@ -705,8 +716,8 @@ struct xen_domctl {
         struct xen_domctl_hypercall_init    hypercall_init;
         struct xen_domctl_arch_setup        arch_setup;
         struct xen_domctl_settimeoffset     settimeoffset;
-        struct xen_domctl_set_tsc_native    set_tsc_native;
         struct xen_domctl_disable_migrate   disable_migrate;
+        struct xen_domctl_tsc_info          tsc_info;
         struct xen_domctl_real_mode_area    real_mode_area;
         struct xen_domctl_hvmcontext        hvmcontext;
         struct xen_domctl_hvmcontext_partial hvmcontext_partial;